# Computations
import numpy as np
import pandas as pd
import pickle
# preprocessing
from sklearn import preprocessing
import re
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this study, we analyze HR data available from kaggle.com. The dataset is fictional and was created by IBM data scientists.
Categorical Parameters:
| | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| Education | Below College | College | Bachelor | Master | Doctor |
| Environment Satisfaction | Low | Medium | High | Very High | |
| Job Involvement | Low | Medium | High | Very High | |
| Job Satisfaction | Low | Medium | High | Very High | |
| Performance Rating | Low | Good | Excellent | Outstanding | |
| Relationship Satisfaction | Low | Medium | High | Very High | |
| WorkLife Balance | Bad | Good | Better | Best | |
These can be encoded as follows:
# Load the HR attrition workbook.
Path = 'Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx'
Data = pd.read_excel(Path)

# Make the headers readable: insert a space between a word character and a following capital
# letter, then expand the abbreviations 'Curr' / 'Num' and detach the '18' of 'Over18'.
Temp = [re.sub(r"(\w)([A-Z])", r"\1 \2", x) for x in Data.columns]
Temp = [x.replace(' Curr ', ' Current ').replace('18',' 18').replace('Num ','Number Of ') for x in Temp]
Data.columns = Temp
del Temp
Data['Business Travel'] = Data['Business Travel'].str.replace('_',' ')

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use Styler.hide() when
# it exists and keep the old call for older pandas.
Temp = Data.head(8).style
display(Temp.hide(axis='index') if hasattr(Temp, 'hide') else Temp.hide_index())
del Temp

Target = 'Attrition'
# Keep the DataFrame column order (a set difference would yield an arbitrary order per run).
Featured_Columns = [x for x in Data.columns if x not in {Target, 'Employee Number'}]

# Dictionaries
# The pickles sit next to the workbook and share its name without the extension.
# rsplit removes only the final '.xlsx', so a dot elsewhere in the path cannot truncate it.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open(Path.rsplit(".", 1)[0] + '_Categorical_Dict.pkl', 'rb') as fp:
    Categorical_Dict = pickle.load(fp)
with open(Path.rsplit(".", 1)[0] + '_Bin_Dict.pkl', 'rb') as fp:
    Bin_Dict = pickle.load(fp)
del fp
| Age | Attrition | Business Travel | Daily Rate | Department | Distance From Home | Education | Education Field | Employee Count | Employee Number | Environment Satisfaction | Gender | Hourly Rate | Job Involvement | Job Level | Job Role | Job Satisfaction | Marital Status | Monthly Income | Monthly Rate | Number Of Companies Worked | Over 18 | Over Time | Percent Salary Hike | Performance Rating | Relationship Satisfaction | Standard Hours | Stock Option Level | Total Working Years | Training Times Last Year | Work Life Balance | Years At Company | Years In Current Role | Years Since Last Promotion | Years With Current Manager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | Yes | Travel Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 49 | No | Travel Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 37 | Yes | Travel Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 33 | No | Travel Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 27 | No | Travel Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| 32 | No | Travel Frequently | 1005 | Research & Development | 2 | 2 | Life Sciences | 1 | 8 | 4 | Male | 79 | 3 | 1 | Laboratory Technician | 4 | Single | 3068 | 11864 | 0 | Y | No | 13 | 3 | 3 | 80 | 0 | 8 | 2 | 2 | 7 | 7 | 3 | 6 |
| 59 | No | Travel Rarely | 1324 | Research & Development | 3 | 3 | Medical | 1 | 10 | 3 | Female | 81 | 4 | 1 | Laboratory Technician | 1 | Married | 2670 | 9964 | 4 | Y | Yes | 20 | 4 | 1 | 80 | 3 | 12 | 3 | 2 | 1 | 0 | 0 | 0 |
| 30 | No | Travel Rarely | 1358 | Research & Development | 24 | 1 | Life Sciences | 1 | 11 | 4 | Male | 67 | 3 | 1 | Laboratory Technician | 3 | Divorced | 2693 | 13335 | 1 | Y | No | 22 | 4 | 2 | 80 | 1 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
First, let's take a look at the dataset.
def Data_Plot(Inp, Title = None, W = None):
    """
    Summarise the completeness of every column of a DataFrame and show it as a bar chart.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Table to inspect.
    Title : str, optional
        Centred, bold chart title; no title is set when None.
    W : int, optional
        Figure width forwarded to the layout; not set when None.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` with the columns 'Features', 'Data Type',
        'Number of NaN Values', 'Size' (row count of ``Inp``) and 'Percentage'
        (100 minus the NaN share, the share being rounded to two decimals).
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Share of entries that are present in each column
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    # One bar per column, coloured by dtype
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
                 text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    # Identity checks: the optional arguments are only skipped when they are exactly None
    if W is not None:
        fig.update_layout(width = W)
    fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if Title is not None:
        # Properly closed bold tag around the title
        fig.update_layout(title={'text': '<b>' + Title + '</b>', 'x':0.5,
                                 'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
# Chart the completeness of the raw dataset and keep the per-column summary table.
data_info = Data_Plot(
    Data,
    Title='IBM HR Analytics Employee Attrition and Performance Dataset',
)
def List_Print(Text, List, C = 'Blue', T = 'White'):
    """
    Print ``Text:`` as a coloured label followed by the items of ``List`` joined by ', '.

    Parameters
    ----------
    Text : str
        Label shown with the coloured background.
    List : iterable
        Items to print; each one is converted with ``str`` before joining.
    C : str
        Background colour name, a key of ``BACK`` below.
    T : str
        Text colour name, a key of ``FORE`` below.

    Raises
    ------
    KeyError
        If ``C`` or ``T`` is not one of the supported colour names.
    """
    # Both tables offer the same colour names ('White' included) so C and T accept the same values
    BACK = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN, 'White': Back.WHITE}
    FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
            'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
    # str() lets numeric or other non-string items be printed instead of raising TypeError in join
    Items = ', '.join(str(x) for x in List)
    print(BACK[C] + FORE[T] + Style.NORMAL + '%s:' % Text + Style.RESET_ALL + ' %s' % Items)
def FeatBins(Inp, Bins, replace = True):
    """
    Bin a numeric Series into the right-closed intervals spanned by consecutive ``Bins`` edges.

    Parameters
    ----------
    Inp : pandas.Series
        Values to bin.
    Bins : sequence
        Bin edges; every edge is converted with ``int``.
    replace : bool
        True  -> return an integer code per value. Codes number the interval labels
                 (e.g. '(5, 10]') in *string* sort order, which is not necessarily numeric order.
        False -> return the interval labels as text, with every '(' replaced by '[' and every
                 '-1' replaced by '0' (so '(-1, 5]' is shown as '[0, 5]').

    Returns
    -------
    pandas.Series
        Codes or display labels, aligned with ``Inp``.
    """
    Bins = [int(x) for x in Bins]
    Intervals = pd.IntervalIndex.from_tuples(list(zip(Bins[:-1], Bins[1:])))
    Labels = pd.cut(Inp, bins = Intervals).astype('str')
    if replace:
        # Every label is a key of the mapping, so map() covers all rows and yields integers
        # without relying on replace() downcasting an object column.
        Codes = {Label: Code for Code, Label in enumerate(np.sort(Labels.unique()))}
        return Labels.map(Codes)
    # Literal (non-regex) replacements: '(' is not a valid regular expression on its own.
    return Labels.str.replace('(', '[', regex = False).str.replace('-1', '0', regex = False)
def FeatAgg(Feat, ColorFeat, Target = Target, Inp = Data):
    """
    Count the rows of ``Inp`` for every (Feat, ColorFeat, Target) combination.

    Returns a DataFrame with those three columns plus 'Count' and 'Percentage'
    (each count as a percentage of the total count, rounded to two decimals),
    sorted by ``Feat``; the ``Feat`` column is converted to strings after sorting.
    """
    Keys = [Feat, ColorFeat, Target]
    Grouped = Inp[Keys].groupby(Keys)[Target]
    Summary = Grouped.agg(['count']).rename(columns = {'count': 'Count'})
    # Each group's share of all counted rows
    Summary['Percentage'] = (100 * Summary['Count'] / Summary['Count'].sum()).round(2)
    Summary = Summary.reset_index(drop = False)
    Summary = Summary.sort_values(by=[Feat])
    Summary[Feat] = Summary[Feat].astype(str)
    return Summary
def DistPlot(Feat, Target = Target, nbins = 20,
             Colors = ['LightSalmon', 'LightBlue'], LC = 'Black',
             yLim = [0, 80], H = 450, titleY = 0.92, Inp = Data):
    """
    Show a histogram of ``Feat`` coloured by ``Target`` with a box-plot marginal.

    Parameters
    ----------
    Feat : str
        Column plotted on the x-axis.
    Target : str
        Column used to colour the bars.
    nbins : int
        Number of bins requested from plotly.
    Colors : list of str
        Discrete colour sequence for the ``Target`` values.
    LC : str
        Bar outline colour.
    yLim : list
        [min, max] range applied to the histogram's y-axis.
    H : int
        Figure height (the width is fixed at 980).
    titleY : float
        Vertical position of the title.
    Inp : pandas.DataFrame
        Table holding ``Feat`` and ``Target``.
    """
    # Hover columns come from the frame being plotted, not from the module-level Data,
    # so the function also works for frames with different columns.
    fig = px.histogram(Inp, x = Feat, nbins=nbins, color= Target, marginal= 'box',
                       color_discrete_sequence= Colors, hover_data=Inp.columns)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    Name = '%s Distribution by %s' % (Target, Feat)
    # Properly closed bold tag around the title
    fig.update_layout(legend_orientation='v', plot_bgcolor= 'white', height= H, width= 980,
                      title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_traces(marker_line_color= LC, marker_line_width=0.5, opacity=1)
    fig['layout']['yaxis'].update(range=yLim)
    fig.show()
def PlotX(df, Feat, ColorFeat, Target = Target,
          Colors = list(mcolors.TABLEAU_COLORS.values()), LC = 'Black',
          yLim = [0, 35], H = 500, titleY = 0.90):
    """
    Show two bar charts side by side, one per ``Target`` value ('No' left, 'Yes' right).

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated table containing ``Feat``, ``ColorFeat``, ``Target`` and a 'Percentage' column.
    Feat : str
        Column plotted on the x-axis.
    ColorFeat : str
        Column used to colour the bars; also the legend title.
    Target : str
        Column whose values 'No' / 'Yes' select the rows of each panel.
    Colors : list of str or None
        Discrete colour sequence; plotly's default colours are used when None.
    LC : str
        Bar outline colour.
    yLim : list
        [min, max] range of the shared y-axis.
    H : int
        Figure height (the width is fixed at 980).
    titleY : float
        Vertical position of the title.
    """
    # Figure
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True, y_title = 'Percent',
                        subplot_titles=('%s: <b>No</b>' % Target, '%s: <b>Yes</b>' % Target))
    # Arguments shared by both panels; the colour sequence is only passed when one was given
    BarArgs = dict(x= Feat, y= 'Percentage', orientation='v', color = ColorFeat,
                   text = 'Percentage', hover_data= df.columns)
    if Colors is not None:
        BarArgs['color_discrete_sequence'] = Colors
    # Left panel (col 1): Target == 'No'; right panel (col 2): Target == 'Yes'
    for Col, Label in enumerate(('No', 'Yes'), start=1):
        Panel = px.bar(df.loc[df[Target] == Label], **BarArgs)
        for Trace in Panel['data']:
            fig.add_trace(Trace, row=1, col=Col)
        TraceStyle = dict(marker_line_color= LC, marker_line_width=1, opacity=1)
        if Col == 2:
            # Only the left panel contributes legend entries
            TraceStyle['showlegend'] = False
        fig.update_traces(row=1, col=Col, **TraceStyle)
    # Update
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range= yLim)
    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat, plot_bgcolor= 'white', height= H, width= 980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    Name = '%s Distribution by %s and %s' % (Feat, ColorFeat, Target)
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Label-encode every object-typed column on a copy, so Data itself keeps its text values.
df = Data.copy()
Categorical_Columns = data_info.loc[data_info['Data Type'] == 'object', 'Features'].tolist()
N = len(Categorical_Columns)
# Progressbar
Counter = 0
Progress_Bar = progressbar.ProgressBar(maxval= N, widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()])
#--------------- the loop ----------------------
Progress_Bar.start()
# Counter is the number of columns finished, so the bar is updated with 1..N rather than 0..N-1
for Counter, Column in enumerate(Categorical_Columns, start=1):
    le = preprocessing.LabelEncoder()
    le.fit(list(df[Column]))
    df[Column] = le.transform(df[Column])
    del le
    Progress_Bar.update(Counter)
Progress_Bar.finish()
#--------------- End of the loop ---------------
|=========================================================================|100%
def Correlation_Plot (Df,Fig_Size):
    """
    Draw a heatmap of the pairwise correlations of ``Df`` and return the matrix.

    The correlations are rounded to two decimals. Cells strictly above the diagonal
    are masked, so the diagonal and the lower triangle are drawn on a square figure
    of side ``Fig_Size``. The colour scale is fixed to the range 0..1.
    """
    Correlation_Matrix = Df.corr().round(2)
    # Non-zero entries mark the cells to hide: everything above the main diagonal
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask, k=1)] = True
    _, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    Palette = sns.color_palette("Greens", n_colors=10)
    Options = dict(ax=ax, mask=mask, annot=True, square=True, cmap=Palette,
                   linewidths = 0.2, vmin=0, vmax=1,
                   cbar_kws={"aspect":40, "shrink": .4}, annot_kws={"size": 8})
    sns.heatmap(Correlation_Matrix, **Options)
    return Correlation_Matrix
# Correlation heatmap of the encoded data, leaving out three columns.
CM = Correlation_Plot(
    df.drop(columns=['Standard Hours', 'Employee Count', 'Over 18']),
    14,
)
# Modifying the dataset for plotting only
# 1) replace the values of each column listed in Categorical_Dict using its mapping
for Feat, Spec in Categorical_Dict.items():
    Data[Feat] = Data[Feat].replace(Spec)
# 2) replace each column listed in Bin_Dict by its interval labels
for Feat, Spec in Bin_Dict.items():
    Data[Feat] = FeatBins(Inp = Data[Feat], Bins = Spec, replace = False)
del Feat, Spec
# 3) store Job Level as text
Data['Job Level'] = Data['Job Level'].astype(str)
# Age: distribution on the encoded frame, then one breakdown per colour feature.
Feat = 'Age'
Colors = ['Bisque','LightYellow','LightGreen','MediumSeaGreen','DarkGreen']
DistPlot(Feat, yLim = [0, 400], Inp = df)
# (colour feature, upper y-axis limit)
for ColorFeat, Top in [('Job Level', 25),
                       ('Monthly Income', 20),
                       ('Number Of Companies Worked', 30),
                       ('Total Working Years', 40),
                       ('Years At Company', 50)]:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, yLim = [0, Top], Colors = Colors)
del Feat, ColorFeat, Top
# Job Level: one breakdown per colour feature.
Feat = 'Job Level'
Colors = ['Bisque','LightYellow','LightGreen','MediumSeaGreen','DarkGreen']
# (colour feature, upper y-axis limit)
for ColorFeat, Top in [('Monthly Income', 30),
                       ('Total Working Years', 25),
                       ('Years At Company', 30),
                       ('Years In Current Role', 25),
                       ('Years Since Last Promotion', 30),
                       ('Years With Current Manager', 25)]:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, yLim = [0, Top], Colors = Colors)
del Feat, ColorFeat, Top
# NOTE(review): this cell uses Feat = 'Job Level' again and repeats five of the plots of the
# previous cell with the same y-limits - possibly another feature (e.g. 'Monthly Income')
# was intended here; confirm with the author.
Feat = 'Job Level'
Colors = ['Bisque','LightYellow','LightGreen','MediumSeaGreen','DarkGreen']
# (colour feature, upper y-axis limit)
for ColorFeat, Top in [('Total Working Years', 25),
                       ('Years At Company', 30),
                       ('Years In Current Role', 25),
                       ('Years Since Last Promotion', 30),
                       ('Years With Current Manager', 25)]:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, yLim = [0, Top], Colors = Colors)
del Feat, ColorFeat, Top
# Total Working Years: one breakdown per colour feature.
Feat = 'Total Working Years'
Colors = ['Bisque','LightYellow','LightGreen','MediumSeaGreen','DarkGreen']
# (colour feature, upper y-axis limit)
for ColorFeat, Top in [('Years At Company', 60),
                       ('Years In Current Role', 40),
                       ('Years Since Last Promotion', 50),
                       ('Years With Current Manager', 40)]:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, yLim = [0, Top], Colors = Colors)
del Feat, ColorFeat, Top
# Years At Company: one breakdown per colour feature.
Feat = 'Years At Company'
Colors = ['Bisque','LightYellow','LightGreen','MediumSeaGreen','DarkGreen']
# (colour feature, upper y-axis limit)
for ColorFeat, Top in [('Years In Current Role', 50),
                       ('Years Since Last Promotion', 70),
                       ('Years With Current Manager', 60)]:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, yLim = [0, Top], Colors = Colors)
del Feat, ColorFeat, Top
# Years In Current Role: one breakdown per colour feature.
Feat = 'Years In Current Role'
Colors = ['Bisque','LightYellow','LightGreen','MediumSeaGreen','DarkGreen']
# (colour feature, upper y-axis limit)
for ColorFeat, Top in [('Years Since Last Promotion', 50),
                       ('Years With Current Manager', 50)]:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, yLim = [0, Top], Colors = Colors)
del Feat, ColorFeat, Top